#Filtering out the null values
me_cars<-me_cars %>% filter(price != " TBD " &
Cylinders != "N/A" &
Trunk.Capacity..liters. != "N/A" &
Trunk.Capacity..liters. != "Null")
#Changing the type of some columns (from char to dbl) and removing strange values
me_cars$price <- gsub(",", "", me_cars$price) %>% as.numeric() %>% round(2)
me_cars$Cylinders <- as.numeric(me_cars$Cylinders)
me_cars$Trunk.Capacity..liters. <- gsub(" \\(.*\\)", "", me_cars$Trunk.Capacity..liters.) %>% as.numeric()
## Warning in gsub(" \\(.*\\)", "", me_cars$Trunk.Capacity..liters.) %>%
## as.numeric(): NAs introduced by coercion
me_cars$Fuel.Tank.Capacity..liters. <- gsub(" \\(.*\\)", "",me_cars$Fuel.Tank.Capacity..liters.) %>% as.numeric()
me_cars$Seating.Capacity <- gsub("Seater", "",me_cars$Seating.Capacity) %>% as.numeric()
me_cars$Wheelbase..meters.<- gsub(" ", "", me_cars$Wheelbase..meters.) %>% as.numeric() %>% round(2)
me_cars$Torque..Nm. %<>% as.numeric
me_cars$Length..meters. %<>% round(2)
me_cars$Width..meters. %<>% round(2)
me_cars$Height..meters. %<>% round(2)
#Changing the type of other columns (from char to fact)
me_cars$Drive.Type %<>% as.factor
me_cars$Fuel.Type %<>% as.factor
me_cars$Transmission %<>% as.factor
me_cars$currency %<>% as.factor
#Renaming some variables so that they are easier to understand
me_cars <- me_cars %>% rename("Engine_Capacity"="Engine.Capacity..liters.",
"Driving"="Drive.Type",
"Fuel_Capacity"="Fuel.Tank.Capacity..liters.",
"Liters_For_100km"="Fuel.Economy..L.100.Km.",
"Fuel_Type"="Fuel.Type",
"Horsepower"="Horsepower..bhp.",
"Torque"="Torque..Nm.",
"Top_Speed"="Top.Speed..Km.h.",
"Seating_Capacity"="Seating.Capacity",
"Acceleration_0100"="Acceleration.0.100.Km.h..sec.",
"Length"="Length..meters.",
"Width"="Width..meters.",
"Height"="Height..meters.",
"Wheelbase"="Wheelbase..meters.",
"Trunk_Capacity"="Trunk.Capacity..liters.",
"Price"="price",
"Currency"="currency",
"Name"="name")
#EXCHANGE RATES RETRIEVED ON JUNE, FRIDAY 23th
me_cars$PriceEURO<-ifelse(me_cars$Currency=="SAR", me_cars$Price*.25,
ifelse(me_cars$Currency=="AED", me_cars$Price*.25,
ifelse(me_cars$Currency=="BHD", me_cars$Price*2.44,
ifelse(me_cars$Currency=="KWD", me_cars$Price*2.99,
ifelse(me_cars$Currency=="OMR", me_cars$Price*2.39, me_cars$Price*.25)))))
#Creating the column Area
me_cars$Area<-ifelse(me_cars$Currency=="SAR", "Saudi Arabia",
ifelse(me_cars$Currency=="AED", "UAE",
ifelse(me_cars$Currency=="BHD", "Bahrain",
ifelse(me_cars$Currency=="KWD", "Kuwait",
ifelse(me_cars$Currency=="OMR", "Oman", "Qatar")))))
me_cars$Area %<>% as.factor
me_cars %>% head()
## Engine_Capacity Cylinders Driving Fuel_Capacity Liters_For_100km
## 1 1.2 3 Front Wheel Drive 42 4.9
## 2 1.2 3 Front Wheel Drive 42 4.9
## 3 1.4 4 Front Wheel Drive 45 6.3
## 4 1.6 4 Front Wheel Drive 50 6.4
## 5 1.5 4 Front Wheel Drive 48 5.8
## 6 1.4 4 Front Wheel Drive 35 5.1
## Fuel_Type Horsepower Torque Transmission Top_Speed Seating_Capacity
## 1 Petrol 76 100 Automatic 170 5
## 2 Petrol 76 100 Automatic 170 5
## 3 Petrol 75 118 Manual 156 2
## 4 Petrol 102 145 Automatic 180 5
## 5 Petrol 112 150 Automatic 170 5
## 6 Petrol 98 127 Automatic 170 5
## Acceleration_0100 Length Width Height Wheelbase Trunk_Capacity
## 1 14.0 4.24 1.67 1.51 2.55 450
## 2 14.0 4.24 1.67 1.51 2.55 450
## 3 16.0 3.86 1.72 1.72 2.51 2800
## 4 11.0 4.35 1.99 1.53 2.63 510
## 5 10.9 4.31 1.81 1.62 2.58 448
## 6 12.0 3.64 1.60 1.48 2.38 314
## Name Price Currency PriceEURO Area
## 1 Mitsubishi Attrage 2021 1.2 GLX (Base) 34099 SAR 8524.75 Saudi Arabia
## 2 Mitsubishi Attrage 2021 1.2 GLX (Base) 34099 SAR 8524.75 Saudi Arabia
## 3 Fiat Fiorino 2021 1.4L Standard 41250 SAR 10312.50 Saudi Arabia
## 4 Renault Symbol 2021 1.6L PE 44930 SAR 11232.50 Saudi Arabia
## 5 MG ZS 2021 1.5L STD 57787 SAR 14446.75 Saudi Arabia
## 6 Chevrolet Spark 2021 1.4L LS 53790 SAR 13447.50 Saudi Arabia
Map for first page of the report
mec_map0 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
column=c(seq(1,6,1)))
mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map0, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$column))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=region), col="black")+
scale_fill_manual(name="Countries", breaks=c("Bahrain", "Kuwait", "Oman", "Qatar", "Saudi Arabia", "United Arab Emirates"),
values=c("azure","skyblue", "deepskyblue", "dodgerblue", "blue3", "navy"))+
theme(plot.title = element_text(face="bold"),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency,-Area,-Price) %>% gather() %>%
ggplot(aes(value)) +
geom_boxplot(fill="skyblue", color="black", outlier.colour = "deepskyblue") +
facet_wrap(~key, scales = 'free')+
theme_minimal()
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).
It is clear that there are a lot of outliers. It is important to remove the most extreme ones. The criterion used to decide which is an outlier and which is not is just common sense, i.e. it is impossible that a car is 4500m long.
value = me_cars[,"Height"][me_cars[,"Height"]>500]
me_cars[,"Height"][me_cars[,"Height"] %in% value] = NA
me_cars = drop_na(me_cars)
value = me_cars[,"Length"][me_cars[,"Length"]>500]
me_cars[,"Length"][me_cars[,"Length"] %in% value] = NA
me_cars = drop_na(me_cars)
value = me_cars[,"Wheelbase"][me_cars[,"Wheelbase"]>500]
me_cars[,"Wheelbase"][me_cars[,"Wheelbase"] %in% value] = NA
me_cars = drop_na(me_cars)
value = me_cars[,"Width"][me_cars[,"Width"]>500]
me_cars[,"Width"][me_cars[,"Width"] %in% value] = NA
me_cars = drop_na(me_cars)
value = me_cars[,"Horsepower"][me_cars[,"Horsepower"]>1250]
me_cars[,"Horsepower"][me_cars[,"Horsepower"] %in% value] = NA
me_cars = drop_na(me_cars)
value = me_cars[,"Trunk_Capacity"][me_cars[,"Trunk_Capacity"]>1300]
me_cars[,"Trunk_Capacity"][me_cars[,"Trunk_Capacity"] %in% value] = NA
me_cars = drop_na(me_cars)
value = me_cars[,"PriceEURO"][me_cars[,"PriceEURO"]>750000]
me_cars[,"PriceEURO"][me_cars[,"PriceEURO"] %in% value] = NA
me_cars = drop_na(me_cars)
#REMOVE DUPLICATES
me_cars %<>% distinct()
me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency, -Price, -Area) %>% gather() %>%
ggplot(aes(value)) +
geom_boxplot(fill="skyblue", color="black", outlier.colour = "deepskyblue") +
facet_wrap(~key, scales = 'free')+
theme_minimal()
me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency, -Area, -Price) %>% gather() %>%
ggplot(aes(value)) +
geom_histogram(fill="skyblue", color="black") +
facet_wrap(~key, scales = 'free')+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Now we have obtained a nice dataset to work with.
me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency,-Price, -Area) %>% cor() %>% corrplot(type = "full",
tl.col = "black", tl.srt = 45)
We see how many variables are quite correlated with each other. This correlation will be better analysed later.
We plot a chart of the correlation between numeric variables and PriceEURO
pr_cor<-me_cars %>% select(-Driving,-Fuel_Type,-Transmission,-Name,-Currency,-Price, -Area) %>% cor() %>% data.frame()
pr_cor<-pr_cor[-15,]
ggplot(pr_cor, aes(x=pr_cor %>% rownames(), y=PriceEURO))+
geom_bar(fill=ifelse(pr_cor$PriceEURO>0,"deepskyblue","red") ,stat="identity")+
labs(x="\nVariables",
y="Correlation with PriceEURO")+
guides(x = guide_axis(angle = 45))+
theme(axis.text = element_text(face="bold"),
panel.background = element_rect(fill = "white",
colour = "white"),
panel.grid.major = element_line(linewidth = 0.01, linetype = 'solid',
colour = "gray88"))
me_cars %>%
ggplot(aes(PriceEURO)) +
geom_density(linewidth=1)+
theme_minimal()
Not normal at all. We try to take the log
me_cars %>%
ggplot(aes(log(PriceEURO))) +
geom_density(linewidth=1)+
stat_overlay_normal_density(color="red", linetype="dashed", linewidth=1)+
theme_minimal()
Much better than before! Let’s try to make statistical tests on this.
To do so we are going to create a new dataset called “mec_stat”, meaning Middle East Cars Stat without some of the useless variable we had in the original dataset.
Then we create a ds for each of the 5 Areas, namely: - Bahrain (BR) - Kuwait (KW) - Oman (OM) - Qatar (QT) - Saudi Arabia (SA) - United Arab Emirates (UAE)
mec_stat<-me_cars %>% select(-Name,-Price, -Currency) #general ds for statistical analysis
#a bit naive but necessary
mec_statSA <- mec_stat %>% filter(Area=="Saudi Arabia") %>% select(-Area)
mec_statUAE <- mec_stat %>% filter(Area=="UAE") %>% select(-Area)
mec_statBR <- mec_stat %>% filter(Area=="Bahrain") %>% select(-Area)
mec_statOM <- mec_stat %>% filter(Area=="Oman") %>% select(-Area)
mec_statQT <- mec_stat %>% filter(Area=="Qatar") %>% select(-Area)
mec_statKW <- mec_stat %>% filter(Area=="Kuwait") %>% select(-Area)
mec_map1 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
AvgPrice=c(mean((mec_statBR$PriceEURO)),
mean((mec_statKW$PriceEURO)),
mean((mec_statQT$PriceEURO)),
mean((mec_statOM$PriceEURO)),
mean((mec_statSA$PriceEURO)),
mean((mec_statUAE$PriceEURO))))
mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map1, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$AvgPrice))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=AvgPrice), color="black")+
scale_fill_gradient(name="Average car price in EURO\n", low="#CCFFFF", high="#000066",
#ticks for the legend
limit=c(min(mapdata$AvgPrice), max(mapdata$AvgPrice)),
breaks=seq(from=round(min(mapdata$AvgPrice)),
to=max(mapdata$AvgPrice),
by=round((max(mapdata$AvgPrice)-round(min(mapdata$AvgPrice)))/4,2)))+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
We now take the Shapiro-Wilk test to verify whether the distribution of the variables is normal or not.
cat("Bahrain")
## Bahrain
shapiro.test(log(mec_statBR$PriceEURO))
##
## Shapiro-Wilk normality test
##
## data: log(mec_statBR$PriceEURO)
## W = 0.9905, p-value = 5.992e-06
cat("\nKuwait")
##
## Kuwait
shapiro.test(log(mec_statKW$PriceEURO))
##
## Shapiro-Wilk normality test
##
## data: log(mec_statKW$PriceEURO)
## W = 0.98714, p-value = 3.242e-05
cat("\nQatar")
##
## Qatar
shapiro.test(log(mec_statQT$PriceEURO))
##
## Shapiro-Wilk normality test
##
## data: log(mec_statQT$PriceEURO)
## W = 0.99397, p-value = 0.01684
cat("\nOman")
##
## Oman
shapiro.test(log(mec_statOM$PriceEURO))
##
## Shapiro-Wilk normality test
##
## data: log(mec_statOM$PriceEURO)
## W = 0.99664, p-value = 0.2442
cat("\nSaudi Arabia")
##
## Saudi Arabia
shapiro.test(log(mec_statSA$PriceEURO))
##
## Shapiro-Wilk normality test
##
## data: log(mec_statSA$PriceEURO)
## W = 0.97415, p-value = 1.969e-08
cat("\nUnited Arab Emirates")
##
## United Arab Emirates
shapiro.test(log(mec_statUAE$PriceEURO))
##
## Shapiro-Wilk normality test
##
## data: log(mec_statUAE$PriceEURO)
## W = 0.98677, p-value = 4.05e-06
We can see that, apart from Oman, all the other distributions are not-gaussian. So we check for the distribution over the residuals which, by the way, is the important distribution to check for making inference with ols.
model1BR=mec_statBR %>% lm(log(PriceEURO)~.,.)
cat("\nBahrain")
##
## Bahrain
shapiro.test(model1BR$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1BR$residuals
## W = 0.98647, p-value = 7.518e-08
model1KW=mec_statKW %>% lm(log(PriceEURO)~.,.)
cat("\nKuwait")
##
## Kuwait
shapiro.test(model1KW$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1KW$residuals
## W = 0.99327, p-value = 0.00766
model1OM=mec_statOM %>% lm(log(PriceEURO)~.,.)
cat("\nOman")
##
## Oman
shapiro.test(model1OM$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1OM$residuals
## W = 0.99217, p-value = 0.003011
model1QT=mec_statQT %>% lm(log(PriceEURO)~.,.)
cat("\nQatar")
##
## Qatar
shapiro.test(model1QT$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1QT$residuals
## W = 0.99132, p-value = 0.001321
model1SA=mec_statSA %>% lm(log(PriceEURO)~.,.)
cat("\nSaudi Arabia")
##
## Saudi Arabia
shapiro.test(model1SA$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1SA$residuals
## W = 0.9558, p-value = 5.581e-12
model1UAE=mec_statUAE %>% lm(log(PriceEURO)~.,.)
cat("\nUnited Arab Emirates")
##
## United Arab Emirates
shapiro.test(model1UAE$residuals)
##
## Shapiro-Wilk normality test
##
## data: model1UAE$residuals
## W = 0.99684, p-value = 0.167
We see that Kuwait, Oman and UAE have normally distributed error while the other countries have not. In order to be consistent with the analysis I decided not to consider the linear regression since not all the results where unbiased.
Now we initialize a function for computing the Vif (variance inflation factor) and for plotting its values.
# VIF FUNCTION
vif_func<-function(model_name, country){
modelvif=data.frame(vif(model_name)) #we compute the vif of the model
modelvif$row=row.names(modelvif) #we do some magic to let R know what we are working with
colnames(modelvif)[1] = "Vif"
#Plotting the Vif
modelvif %>%
ggplot(aes(row, Vif))+
geom_bar(stat="identity")+
geom_hline(yintercept = 5, linewidth=1, color="yellow")+
geom_hline(yintercept = 7.5, linewidth=1, color="orange")+
geom_hline(yintercept = 10, linewidth=1, color="red")+
scale_y_continuous(limits = c(0,30), breaks = seq(0,30,2.5), minor_breaks = NULL)+
labs(x=NULL, y=NULL, title = bquote(paste("Vif value per variable - Area: ",.(country))))+
coord_flip()+
theme_minimal()
}
We initialize also a function for obtaining the best subset possible.
#BEST SUBSET MODEL
best_sub<-function(ds,country){
best_sub<-regsubsets(x= log(PriceEURO)~.,data = ds, nvmax = length(names(ds)),
method = "forward") #we do the subset
best_sub_summary<- best_sub %>% summary()
best_sub_summary$cp %>% which.min()
plot(best_sub, scale="Cp") #we plot the best one
title(main=country)
}
We do again the regression for convenience.
model1BR=mec_statBR %>% lm(log(PriceEURO)~.,.,)
model1KW=mec_statKW %>% lm(log(PriceEURO)~.,.)
model1QT=mec_statQT %>% lm(log(PriceEURO)~.,.)
model1OM=mec_statOM %>% lm(log(PriceEURO)~.,.)
model1SA=mec_statSA %>% lm(log(PriceEURO)~.,.)
model1UAE=mec_statUAE %>% lm(log(PriceEURO)~.,.)
First we check the Vif for all the models and…
vif_func(model1BR,"Bahrain")
vif_func(model1KW,"Kuwait")
vif_func(model1QT,"Qatar")
vif_func(model1OM,"Oman")
vif_func(model1SA,"Saudi Arabia")
vif_func(model1UAE,"United Arab Emirates")
… we eliminate the most correlated variables:
mec_statBR <- mec_statBR %>% select(-Horsepower, -Engine_Capacity)
mec_statKW <- mec_statKW %>% select(-Horsepower, -Engine_Capacity)
mec_statQT <- mec_statQT %>% select(-Horsepower, -Engine_Capacity)
mec_statSA <- mec_statSA %>% select(-Horsepower, -Engine_Capacity)
mec_statOM <- mec_statOM %>% select(-Horsepower, -Engine_Capacity)
mec_statUAE <- mec_statUAE %>% select(-Length, -Horsepower,-Engine_Capacity)
Now, with the leftover variables, we compute the best possible subset for each model and…
mec_statBR %>% best_sub("Bahrain")
mec_statKW %>% best_sub("Kuwait")
mec_statQT %>% best_sub("Qatar")
mec_statOM %>% best_sub("Oman")
mec_statSA %>% best_sub("Saudi Arabia")
mec_statUAE %>% best_sub("UAE")
… again we remove the non-important ones and create some dummies if necessary. To be clear, the non-important variables are the ones that at the top of the chart have a white rectangle.
#Selecting only the important variables
#BAHRAIN
mec_statBR <- mec_statBR %>% select(-Transmission)
#KUWAIT
mec_statKW <- mec_statKW %>% select(-c(Acceleration_0100, Length))
#QATAR
mec_statQT$TransMan <-ifelse(mec_statQT$Transmission=="Manual",1,0)
mec_statQT <- mec_statQT %>% select(-Transmission, -Fuel_Type, -Length)
#OMAN
mec_statOM$FT_Hybrid <-ifelse(mec_statOM$Fuel_Type=="Hybrid",0,1)
mec_statOM <- mec_statOM %>% select(-Length, -Transmission, -Fuel_Type)
#Saudi Arabia
mec_statSA$DrivingFWD <-ifelse(mec_statSA$Driving=="Front Wheel Drive",1,0)
mec_statSA <- mec_statSA %>% select(-Driving, -Transmission, Width)
#United Arab Emirates
mec_statUAE$DrivingFWD <-ifelse(mec_statUAE$Driving=="Front Wheel Drive",1,0)
mec_statUAE$FT_Hybrid <-ifelse(mec_statUAE$Fuel_Type=="Hybrid",1,0)
mec_statUAE <- mec_statUAE %>% select(-Driving,-Fuel_Type,-Transmission)
We now retrain the linear model with the new datasets
model1BR=mec_statBR %>% lm(log(PriceEURO)~.,.,)
model1KW=mec_statKW %>% lm(log(PriceEURO)~.,.)
model1OM=mec_statOM %>% lm(log(PriceEURO)~.,.)
model1QT=mec_statQT %>% lm(log(PriceEURO)~.,.)
model1SA=mec_statSA %>% lm(log(PriceEURO)~.,.)
model1UAE=mec_statUAE %>% lm(log(PriceEURO)~.,.)
Check again the Vif and notice that things got way better.
vif_func(model1BR,"Bahrain")
vif_func(model1KW,"Kuwait")
vif_func(model1OM,"Oman")
vif_func(model1QT,"Qatar")
vif_func(model1SA,"Saudi Arabia")
vif_func(model1UAE,"UAE")
Now we have cleaned the dataset as much as possible. We don’t have variables that suffer from multicollinearity and we have selected the best subset of variables, this seems perfect!!
What about using the robust regression since…
ols_plot_resid_lev(model1KW)
… there are so many outliers?
Initializing a function to compute the robust regression. We are not scaling the variable PriceEURO and we are removing, if present, the variable DrivingFWD because it was giving problems.
rob_reg<-function(ds){
set.seed(1)
index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE)
#so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
PriceEURO<-ds$PriceEURO
ds<-ds %>% select(where(is.numeric), -PriceEURO) %>% scale() %>% data.frame() %>% cbind(PriceEURO)
Train <- ds[index==1, ] # Train data
Test <- ds[index == 2, ] #Test data
model1<- Train %>% lmrob(log(PriceEURO)~.,.)
predicted<-predict(model1, newdata=Test)
cat("\nR²: ",
1-(
(sum((log(Test$PriceEURO)-predicted)^2))/
(sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))),"\n")
model1 %>% summary() %>% print()
model1$coefficients %>% sort(decreasing = TRUE)
}
From the following cell it is possible to retrieve the most influent variables in the model by looking among the significant ones at those with the highest coefficient in absolute value.
cat("BAHRAIN:")
## BAHRAIN:
mec_statBR %>% rob_reg()
##
## R²: 0.8886621
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -1.3266935 -0.1435016 0.0006825 0.1543759 0.9993950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.75787 0.00919 1170.563 < 2e-16 ***
## Cylinders 0.12830 0.03256 3.940 8.89e-05 ***
## Fuel_Capacity 0.16452 0.02103 7.824 1.69e-14 ***
## Liters_For_100km -0.06353 0.02377 -2.672 0.00769 **
## Torque 0.19791 0.04304 4.598 4.98e-06 ***
## Top_Speed 0.20916 0.02949 7.093 2.98e-12 ***
## Seating_Capacity -0.10199 0.01979 -5.155 3.23e-07 ***
## Acceleration_0100 -0.15107 0.02553 -5.916 4.95e-09 ***
## Length -0.04884 0.03079 -1.586 0.11313
## Width 0.05286 0.01205 4.389 1.30e-05 ***
## Height 0.05734 0.02375 2.414 0.01601 *
## Wheelbase 0.10379 0.03156 3.289 0.00105 **
## Trunk_Capacity -0.02819 0.01204 -2.342 0.01946 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.2282
## Multiple R-squared: 0.91, Adjusted R-squared: 0.9086
## Convergence in 15 IRWLS iterations
##
## Robustness weights:
## 2 observations c(310,476) are outliers with |weight| = 0 ( < 0.00013);
## 75 weights are ~= 1. The remaining 705 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01118 0.86620 0.94990 0.89130 0.98420 0.99900
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 1.279e-04 8.041e-12 5.000e-01 5.000e-01
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd fast.s.large.n
## 200 0 1000 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Top_Speed Torque Fuel_Capacity
## 10.75787271 0.20915652 0.19791278 0.16451978
## Cylinders Wheelbase Height Width
## 0.12830315 0.10379297 0.05733867 0.05286172
## Trunk_Capacity Length Liters_For_100km Seating_Capacity
## -0.02819339 -0.04884005 -0.06352557 -0.10199139
## Acceleration_0100
## -0.15106847
cat("\n\nKUWAIT:")
##
##
## KUWAIT:
mec_statKW %>% rob_reg()
##
## R²: 0.8531936
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -0.836638 -0.166968 0.006574 0.162523 0.941895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.728674 0.012552 854.770 < 2e-16 ***
## Cylinders 0.044608 0.026862 1.661 0.097436 .
## Fuel_Capacity 0.105613 0.025441 4.151 3.91e-05 ***
## Liters_For_100km -0.008757 0.025763 -0.340 0.734077
## Torque 0.304085 0.031588 9.627 < 2e-16 ***
## Top_Speed 0.356867 0.027357 13.045 < 2e-16 ***
## Seating_Capacity -0.088859 0.021868 -4.063 5.65e-05 ***
## Width 0.054069 0.014514 3.725 0.000218 ***
## Height 0.092683 0.024166 3.835 0.000142 ***
## Wheelbase 0.065010 0.021467 3.028 0.002592 **
## Trunk_Capacity -0.039214 0.014135 -2.774 0.005750 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.236
## Multiple R-squared: 0.9022, Adjusted R-squared: 0.9002
## Convergence in 14 IRWLS iterations
##
## Robustness weights:
## 36 weights are ~= 1. The remaining 454 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07512 0.85280 0.94990 0.89160 0.98610 0.99900
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 2.041e-04 7.438e-12 5.000e-01 5.000e-01
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd fast.s.large.n
## 200 0 1000 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Top_Speed Torque Fuel_Capacity
## 10.728674391 0.356867458 0.304084544 0.105613081
## Height Wheelbase Width Cylinders
## 0.092682928 0.065009742 0.054069309 0.044608112
## Liters_For_100km Trunk_Capacity Seating_Capacity
## -0.008756928 -0.039214365 -0.088858677
cat("\n\nOMAN:")
##
##
## OMAN:
mec_statOM %>% rob_reg()
##
## R²: 0.8713075
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -0.986764 -0.141222 -0.008141 0.145942 0.694959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.88905 0.01106 984.820 < 2e-16 ***
## Cylinders 0.12628 0.02672 4.727 3.02e-06 ***
## Fuel_Capacity 0.07831 0.02242 3.493 0.000522 ***
## Liters_For_100km -0.02534 0.02835 -0.894 0.371951
## Torque 0.20533 0.04073 5.041 6.62e-07 ***
## Top_Speed 0.20531 0.02657 7.727 6.70e-14 ***
## Seating_Capacity -0.12736 0.02316 -5.499 6.29e-08 ***
## Acceleration_0100 -0.13522 0.02691 -5.025 7.16e-07 ***
## Width 0.04335 0.01222 3.546 0.000430 ***
## Height 0.10878 0.02146 5.069 5.76e-07 ***
## Wheelbase 0.11992 0.02447 4.901 1.31e-06 ***
## Trunk_Capacity -0.06123 0.01572 -3.896 0.000112 ***
## FT_Hybrid -0.03505 0.02227 -1.574 0.116159
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.2102
## Multiple R-squared: 0.9207, Adjusted R-squared: 0.9187
## Convergence in 17 IRWLS iterations
##
## Robustness weights:
## observation 177 is an outlier with |weight| = 0 ( < 0.00021);
## 42 weights are ~= 1. The remaining 441 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01013 0.86590 0.94780 0.88690 0.98470 0.99900
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 2.066e-04 1.033e-11 5.000e-01 5.000e-01
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd fast.s.large.n
## 200 0 1000 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Torque Top_Speed Cylinders
## 10.88905288 0.20533215 0.20530627 0.12627742
## Wheelbase Height Fuel_Capacity Width
## 0.11991893 0.10878440 0.07830963 0.04334625
## Liters_For_100km FT_Hybrid Trunk_Capacity Seating_Capacity
## -0.02533854 -0.03504849 -0.06122890 -0.12735750
## Acceleration_0100
## -0.13521684
cat("\n\nQatar:")
##
##
## Qatar:
mec_statQT %>% rob_reg()
##
## R²: 0.8709802
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -0.9661351 -0.1550684 0.0003846 0.1459891 0.7342822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.84969 0.01159 935.754 < 2e-16 ***
## Cylinders 0.06471 0.02883 2.244 0.025268 *
## Fuel_Capacity 0.10305 0.02605 3.955 8.82e-05 ***
## Liters_For_100km -0.02533 0.03858 -0.657 0.511792
## Torque 0.25658 0.03864 6.641 8.59e-11 ***
## Top_Speed 0.24346 0.03410 7.139 3.58e-12 ***
## Seating_Capacity -0.10463 0.02880 -3.633 0.000311 ***
## Acceleration_0100 -0.11464 0.03233 -3.546 0.000430 ***
## Width 0.04319 0.01461 2.957 0.003264 **
## Height 0.06940 0.02405 2.886 0.004081 **
## Wheelbase 0.11998 0.02649 4.529 7.50e-06 ***
## Trunk_Capacity -0.03779 0.01861 -2.031 0.042812 *
## TransMan -0.02446 0.01636 -1.494 0.135727
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.2125
## Multiple R-squared: 0.9184, Adjusted R-squared: 0.9164
## Convergence in 17 IRWLS iterations
##
## Robustness weights:
## 45 weights are ~= 1. The remaining 440 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.003354 0.843000 0.943100 0.879200 0.984300 0.999000
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 2.062e-04 9.127e-12 5.000e-01 5.000e-01
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd fast.s.large.n
## 200 0 1000 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Torque Top_Speed Wheelbase
## 10.84968805 0.25658402 0.24346222 0.11997904
## Fuel_Capacity Height Cylinders Width
## 0.10304928 0.06940083 0.06471392 0.04318722
## TransMan Liters_For_100km Trunk_Capacity Seating_Capacity
## -0.02445611 -0.02533065 -0.03778802 -0.10463409
## Acceleration_0100
## -0.11463592
cat("\n\nSAUDI ARABIA")
##
##
## SAUDI ARABIA
mec_statSA %>% rob_reg()
##
## R²: 0.8458653
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -0.94595 -0.15643 -0.00349 0.15680 2.40420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.84434 0.01779 609.513 < 2e-16 ***
## Cylinders 0.06789 0.04890 1.388 0.165764
## Fuel_Capacity 0.11017 0.03579 3.078 0.002211 **
## Liters_For_100km -0.04877 0.03256 -1.498 0.134882
## Torque 0.25097 0.04716 5.322 1.64e-07 ***
## Top_Speed 0.26074 0.03752 6.949 1.32e-11 ***
## Seating_Capacity -0.11413 0.02819 -4.048 6.09e-05 ***
## Acceleration_0100 -0.12793 0.02940 -4.352 1.68e-05 ***
## Length -0.07267 0.03654 -1.989 0.047329 *
## Width 0.03083 0.01993 1.547 0.122537
## Height 0.09445 0.02551 3.703 0.000240 ***
## Wheelbase 0.13202 0.04306 3.066 0.002302 **
## Trunk_Capacity -0.02472 0.01550 -1.595 0.111397
## DrivingFWD -0.06886 0.01770 -3.891 0.000115 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.2317
## Multiple R-squared: 0.9047, Adjusted R-squared: 0.9019
## Convergence in 24 IRWLS iterations
##
## Robustness weights:
## observation 21 is an outlier with |weight| = 0 ( < 0.00022);
## 39 weights are ~= 1. The remaining 415 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04663 0.86680 0.94850 0.88530 0.98380 0.99900
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 2.198e-04 6.690e-12 5.000e-01 5.000e-01
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd fast.s.large.n
## 200 0 1000 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Top_Speed Torque Wheelbase
## 10.84434211 0.26073533 0.25097195 0.13201844
## Fuel_Capacity Height Cylinders Width
## 0.11016927 0.09445222 0.06789084 0.03083190
## Trunk_Capacity Liters_For_100km DrivingFWD Length
## -0.02471781 -0.04877284 -0.06885610 -0.07267291
## Seating_Capacity Acceleration_0100
## -0.11412580 -0.12793431
cat("\n\nUAE:")
##
##
## UAE:
mec_statUAE %>% rob_reg()
##
## R²: 0.8914368
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -0.953682 -0.168612 0.006339 0.163670 0.796055
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.76691 0.01295 831.355 < 2e-16 ***
## Cylinders 0.14844 0.03677 4.037 6.17e-05 ***
## Fuel_Capacity 0.08232 0.03366 2.446 0.014765 *
## Liters_For_100km -0.05864 0.03819 -1.535 0.125258
## Torque 0.22845 0.04836 4.724 2.91e-06 ***
## Top_Speed 0.21031 0.02788 7.544 1.82e-13 ***
## Seating_Capacity -0.07130 0.02204 -3.235 0.001286 **
## Acceleration_0100 -0.11047 0.02824 -3.911 0.000103 ***
## Width 0.07893 0.01660 4.754 2.53e-06 ***
## Height 0.04918 0.02158 2.279 0.023015 *
## Wheelbase 0.05888 0.02324 2.534 0.011559 *
## Trunk_Capacity -0.03038 0.01670 -1.820 0.069313 .
## DrivingFWD -0.10175 0.01532 -6.641 7.31e-11 ***
## FT_Hybrid 0.02419 0.02077 1.165 0.244633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.2379
## Multiple R-squared: 0.9055, Adjusted R-squared: 0.9033
## Convergence in 20 IRWLS iterations
##
## Robustness weights:
## 46 weights are ~= 1. The remaining 536 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07166 0.83870 0.94750 0.88390 0.98450 0.99900
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 1.718e-04 1.026e-11 5.000e-01 5.000e-01
## nResample max.it best.r.s k.fast.s k.max
## 500 50 2 1 200
## maxit.scale trace.lev mts compute.rd fast.s.large.n
## 200 0 1000 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Torque Top_Speed Cylinders
## 10.76690504 0.22844869 0.21030951 0.14843925
## Fuel_Capacity Width Wheelbase Height
## 0.08231884 0.07893028 0.05887695 0.04918252
## FT_Hybrid Trunk_Capacity Liters_For_100km Seating_Capacity
## 0.02418564 -0.03038375 -0.05864114 -0.07129609
## DrivingFWD Acceleration_0100
## -0.10174564 -0.11047159
#Storing the value of the R2 of the robust regression
R2rr<-c(0.8886621,
0.8531936,
0.8713075,
0.8709802,
0.8458653,
0.8914368)
mec_map5 <- data.frame(region=c("Bahrain", "Kuwait", "Oman", "Qatar", "Saudi Arabia", "United Arab Emirates"),
R2rr=R2rr)
mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map5, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$R2rr))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=R2rr), color="black")+
scale_fill_gradient(name="R² for Robust Regression\n", low="#CCFFFF", high="#000066",
limit=c(.84, .90),
breaks=seq(from=.84,
to=.90,
by=.01))+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
Since neither the linear nor the robust linear model were good, for a reason or another, we could try with regression trees.
— Notice that we are doing first a single regression tree before doing the random forest
tree_constructor <- function(ds) { #this function works only with rpart objects
set.seed(1)
index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE) #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
Train <- ds[index==1, ] # Train data
Test <- ds[index == 2, ] #Test data
model1<- Train %>% rpart(log(PriceEURO)~.,., method="anova", model = TRUE)
#we use anova because we are doing regression
cp <- which.min(model1$cptable[, "xerror"]) %>% model1$cptable[., "CP"]
#we select the min value of the column xerror of the cptable (where cp stands for the cost of pruning )
model1_pr <- prune(tree = model1, cp =cp)
model1_pr %>% rpart.plot(roundint = TRUE, digits = 4, left=FALSE)
predicted<-predict(model1_pr, newdata=Test)
cat("\nThe MSE is:",mean(predicted-log(Test$PriceEURO))^2,"\n")
cat("\nR²: ",
1-(
(sum((log(Test$PriceEURO)-predicted)^2))/
(sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))),"\n")
print(data.frame(model1_pr$variable.importance))
return(1-(
(sum((log(Test$PriceEURO)-predicted)^2))/
(sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))))
}
We can now see the best single tree for each country
R2Tree<- c(tree_constructor(mec_statBR), #create a variable R2Tree for the Rsquared
tree_constructor(mec_statKW),
tree_constructor(mec_statOM),
tree_constructor(mec_statQT),
tree_constructor(mec_statSA),
tree_constructor(mec_statUAE))
##
## The MSE is: 2.397884e-05
##
## R²: 0.8180443
## model1_pr.variable.importance
## Torque 385.544836
## Acceleration_0100 270.107073
## Top_Speed 236.847364
## Cylinders 199.457032
## Fuel_Capacity 140.951901
## Driving 105.660058
## Width 33.797930
## Length 21.015265
## Height 9.387311
## Liters_For_100km 6.879841
## Seating_Capacity 6.484826
## Wheelbase 6.391267
## Trunk_Capacity 3.175046
##
## The MSE is: 5.513108e-05
##
## R²: 0.7998163
## model1_pr.variable.importance
## Torque 241.759365
## Top_Speed 144.147067
## Cylinders 110.595604
## Wheelbase 110.555304
## Fuel_Capacity 104.176723
## Width 80.915688
## Driving 18.318286
## Liters_For_100km 5.865978
## Height 5.226239
## Trunk_Capacity 4.562545
## Seating_Capacity 1.808048
## Fuel_Type 1.181117
##
## The MSE is: 0.0004196496
##
## R²: 0.835828
## model1_pr.variable.importance
## Torque 244.5673703
## Acceleration_0100 163.8823343
## Top_Speed 136.7798951
## Cylinders 127.2608099
## Wheelbase 107.0998685
## Fuel_Capacity 105.7481582
## Driving 20.2455195
## Width 7.6473476
## Liters_For_100km 3.1402231
## Height 0.8976608
## Trunk_Capacity 0.8976608
##
## The MSE is: 0.001506967
##
## R²: 0.8542003
## model1_pr.variable.importance
## Torque 246.4207359
## Acceleration_0100 183.6716560
## Top_Speed 160.5372171
## Cylinders 132.1265966
## Wheelbase 86.0343215
## Liters_For_100km 84.5581803
## Width 16.3161102
## Fuel_Capacity 11.7870555
## Driving 10.1884367
## Height 4.6239124
## Trunk_Capacity 2.2538783
## Seating_Capacity 0.1224919
##
## The MSE is: 0.0003354466
##
## R²: 0.7114432
## model1_pr.variable.importance
## Torque 235.438646
## Acceleration_0100 146.160622
## Top_Speed 118.817794
## Cylinders 93.715447
## Wheelbase 80.757720
## Width 67.227059
## Fuel_Capacity 21.263345
## DrivingFWD 15.013701
## Length 8.855546
## Liters_For_100km 5.238668
## Trunk_Capacity 4.427914
## Fuel_Type 2.249831
## Height 2.225605
## Seating_Capacity 1.907661
##
## The MSE is: 0.001250991
##
## R²: 0.8046047
## model1_pr.variable.importance
## Torque 321.645929
## Acceleration_0100 211.774480
## Top_Speed 186.399227
## Cylinders 172.916515
## Fuel_Capacity 142.437278
## Width 130.365838
## Wheelbase 14.010734
## Liters_For_100km 7.829249
## Height 6.972115
## Trunk_Capacity 5.352463
mec_map2 <- data.frame(region=c("Bahrain", "Kuwait", "Oman", "Qatar", "Saudi Arabia", "United Arab Emirates"),
R2Tree=R2Tree)
mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map2, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$R2Tree))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=R2Tree), color="black")+
scale_fill_gradient(name="R² for Regression Trees\n", low="#CCFFFF", high="#000066",
limit=c(.70, .90),
breaks=seq(from=.70,
to=.90,
by=.04))+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
Not bad but this is the worst model obtained so far.
rf_model <- function(ds, country) {
set.seed(1)
index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE) #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
Train <- ds[index==1, ] # Train data
Test <- ds[index==2, ]
##### BEST MTRY #####
# Define the cross validation
control <- trainControl(method = "cv",
number = 10, #number of folds
search = "random")
grid <- data.frame(mtry = c(2, 3, 4)) #
# train the model
best_mtry_model <- train(log(PriceEURO) ~ .,
data = Train,
method = "rf",
trControl = control,
tuneGrid=grid)
cat("\nThe best nr of mtry for min error is: ", best_mtry_model$bestTune[1,1])
##### BEST NTREE #####
tree_nr <- seq(from= 50, to= 500, by=50) # number of trees to test
MSE_train<-c()
for (i in 1:length(tree_nr)) {
model_rf <- randomForest(log(PriceEURO) ~ ., data = Train, ntree = tree_nr[i],
mtry=best_mtry_model$bestTune[1,1])
MSE_train<-append(MSE_train, model_rf$mse[i]) #append the mse to the vector
}
# MSE error minimizer
best_ntree <- min(MSE_train[MSE_train<0.09 & MSE_train>0.04]) %>% match(MSE_train) %>% tree_nr[.] #best_ntree is the number such that the MSE is within 0.05 and 0.09 to avoid overfitting. These values have been found thanks to empirical tests
cat("\nThe best nr of trees for min error is: ", best_ntree)
rf1=randomForest(log(PriceEURO)~.,
data=Train,
ntree=best_ntree,
mtry=best_mtry_model$bestTune[1,1],
importance=TRUE)
print(rf1)
plot(rf1, main=country)
varImpPlot(rf1, main=country)
yhat <- predict(rf1,newdata = Test)
plot (yhat , Test$PriceEURO %>% log(), xlab="Predicted ln(Price)",ylab = "Actual ln(Price)", main=country)
abline (0, 1, col="red")
cat("Mean squared error: ",mean(yhat - log(Test$PriceEURO))^2)
cat("\nR²: ",
1-(
(sum((log(Test$PriceEURO)-yhat)^2))/
(sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))))
}
#NEW
rf_model(mec_statBR, "Bahrain")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.03080628
## % Var explained: 94.96
## Mean squared error: 5.706505e-05
## R²: 0.9461617
#NEW
rf_model(mec_statKW, "Kuwait")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.03652742
## % Var explained: 93.99
## Mean squared error: 0.0004108964
## R²: 0.926028
#NEW
rf_model(mec_statOM, "Oman")
##
## The best nr of mtry for min error is: 3
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.03564172
## % Var explained: 94.13
## Mean squared error: 0.0004943466
## R²: 0.9390241
#NEW
rf_model(mec_statQT, "Qatar")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 350
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 350
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.0354777
## % Var explained: 94.27
## Mean squared error: 0.0002218077
## R²: 0.9436625
#NEW
rf_model(mec_statSA, "Saudi Arabia")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.05466865
## % Var explained: 91.56
## Mean squared error: 0.0002745637
## R²: 0.9077597
#NEW
rf_model(mec_statUAE, "United Arab Emirates")
##
## The best nr of mtry for min error is: 3
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.04145605
## % Var explained: 93.89
## Mean squared error: 0.0001357007
## R²: 0.9362491
R2rf<-c(0.9458579, 0.9260215,0.9434955, 0.9389488, 0.9077597, 0.9359294)
mec_map3 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
R2rf=R2rf)
mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map3, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$R2rf))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=R2rf), color="black")+
scale_fill_gradient(name="R² for Random Forest\n", low="#CCFFFF", high="#000066",
limit=c(.90, .95),
breaks=seq(from=.90,
to=.95,
by=.01))+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
The random forest is clearly the best model even though it is a bit slow.
R2df<-left_join(mec_map2,mec_map3, by="region")
R2df$R2rr<-R2rr
R2best_Tr_rr<-c()
for (i in (1:6)){
if (R2df[i,2]>R2df[i,4]){
R2best_Tr_rr[i]<-"Regression Tree"
} else if (R2df[i,2]==R2df[i,4]) {
R2best_Tr_rr[i]<-"Tie"
}else {
R2best_Tr_rr[i]<-"Robust Regression"
}
}
R2df$R2best_Tr_rr<-R2best_Tr_rr
R2df$R2best_Tr_rr %<>% as.factor
R2df
## region R2Tree R2rf R2rr R2best_Tr_rr
## 1 Bahrain 0.8180443 0.9458579 0.8886621 Robust Regression
## 2 Kuwait 0.7998163 0.9260215 0.8531936 Robust Regression
## 3 Oman 0.8358280 0.9389488 0.8713075 Robust Regression
## 4 Qatar 0.8542003 0.9434955 0.8709802 Robust Regression
## 5 Saudi Arabia 0.7114432 0.9077597 0.8458653 Robust Regression
## 6 United Arab Emirates 0.8046047 0.9359294 0.8914368 Robust Regression
In this map we are comparing R2 to see which is the highest
mapdata<-map_data("world")
mapdata <- left_join(mapdata, R2df, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$R2best_Tr_rr))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=R2best_Tr_rr), color="white")+
scale_fill_manual(name="Best model according to R²:\n",labels=c("Robust Regression", "Regression Tree","Tie"),
values=c( "#000066", "#CCFFFF", "lightgrey"))+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
For this section I decided to use the PCA. Since the PCA works better with correlated variables, I decided to restore the original datasets, excluding just the variable Area
mec_statSA2 <- mec_stat %>% filter(Area=="Saudi Arabia") %>% select(-Area)
mec_statUAE2 <- mec_stat %>% filter(Area=="UAE") %>% select(-Area)
mec_statBR2 <- mec_stat %>% filter(Area=="Bahrain") %>% select(-Area)
mec_statQT2 <- mec_stat %>% filter(Area=="Qatar") %>% select(-Area)
mec_statOM2 <- mec_stat %>% filter(Area=="Oman") %>% select(-Area)
mec_statKW2 <- mec_stat %>% filter(Area=="Kuwait") %>% select(-Area)
mec_statSA2 %>% glimpse() #for example
## Rows: 565
## Columns: 18
## $ Engine_Capacity <dbl> 1.2, 1.6, 1.5, 1.4, 1.6, 1.4, 1.4, 1.5, 1.6, 1.4, 1.…
## $ Cylinders <dbl> 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4…
## $ Driving <fct> Front Wheel Drive, Front Wheel Drive, Front Wheel Dr…
## $ Fuel_Capacity <dbl> 42, 50, 48, 35, 50, 45, 45, 48, 45, 35, 60, 35, 48, …
## $ Liters_For_100km <dbl> 4.9, 6.4, 5.8, 5.1, 6.4, 6.3, 6.3, 5.8, 6.4, 5.1, 7.…
## $ Fuel_Type <fct> Petrol, Petrol, Petrol, Petrol, Petrol, Petrol, Petr…
## $ Horsepower <int> 76, 102, 112, 98, 102, 100, 100, 112, 123, 98, 130, …
## $ Torque <dbl> 100, 145, 150, 127, 145, 132, 132, 150, 151, 127, 17…
## $ Transmission <fct> Automatic, Automatic, Automatic, Automatic, Automati…
## $ Top_Speed <dbl> 170, 180, 170, 170, 180, 183, 183, 170, 193, 170, 18…
## $ Seating_Capacity <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5…
## $ Acceleration_0100 <dbl> 14.0, 11.0, 10.9, 12.0, 11.0, 13.4, 13.4, 10.9, 11.3…
## $ Length <dbl> 4.24, 4.35, 4.31, 3.64, 4.35, 4.44, 4.44, 4.31, 4.44…
## $ Width <dbl> 1.67, 1.99, 1.81, 1.60, 1.99, 1.73, 1.73, 1.81, 1.73…
## $ Height <dbl> 1.51, 1.53, 1.62, 1.48, 1.53, 1.48, 1.48, 1.62, 1.48…
## $ Wheelbase <dbl> 2.55, 2.63, 2.58, 2.38, 2.63, 2.60, 2.60, 2.58, 2.60…
## $ Trunk_Capacity <dbl> 450, 510, 448, 314, 510, 396, 396, 448, 396, 314, 33…
## $ PriceEURO <dbl> 8524.75, 11232.50, 14446.75, 13447.50, 13695.00, 133…
Some data cleaning not to have categorical variables
#BAHRAIN
mec_statBR2$DrivingFWD <- ifelse(mec_statBR2$Driving=="Front Wheel Drive", 1,0)
mec_statBR2$DrivingAWD <- ifelse(mec_statBR2$Driving=="All Wheel Drive", 1,0)
mec_statBR2$DrivingRWD <- ifelse(mec_statBR2$Driving=="Rear Wheel Drive", 1,0)
mec_statBR2$TransMan <- ifelse(mec_statBR2$Transmission=="Manual", 1,0)
mec_statBR2$TransCVT <- ifelse(mec_statBR2$Transmission=="CVT", 1,0)
mec_statBR2$TransAuto <- ifelse(mec_statBR2$Transmission=="Automatic", 1,0)
mec_statBR2$FTHybr <- ifelse(mec_statBR2$Fuel_Type=="Hybrid", 1,0)
mec_statBR2$FTPetr <- ifelse(mec_statBR2$Fuel_Type=="Petrol", 1,0)
mec_statBR2$FTDiesel <- ifelse(mec_statBR2$Fuel_Type=="Diesel", 1,0)
mec_statBR2 <- mec_statBR2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))
#KUWAIT
mec_statKW2$DrivingFWD <- ifelse(mec_statKW2$Driving=="Front Wheel Drive", 1,0)
mec_statKW2$DrivingAWD <- ifelse(mec_statKW2$Driving=="All Wheel Drive", 1,0)
mec_statKW2$DrivingRWD <- ifelse(mec_statKW2$Driving=="Rear Wheel Drive", 1,0)
mec_statKW2$TransMan <- ifelse(mec_statKW2$Transmission=="Manual", 1,0)
mec_statKW2$TransCVT <- ifelse(mec_statKW2$Transmission=="CVT", 1,0)
mec_statKW2$TransAuto <- ifelse(mec_statKW2$Transmission=="Automatic", 1,0)
mec_statKW2$FTHybr <- ifelse(mec_statKW2$Fuel_Type=="Hybrid", 1,0)
mec_statKW2$FTPetr <- ifelse(mec_statKW2$Fuel_Type=="Petrol", 1,0)
mec_statKW2$FTDiesel <- ifelse(mec_statKW2$Fuel_Type=="Diesel", 1,0)
mec_statKW2 <- mec_statKW2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))
#QATAR
mec_statQT2$DrivingFWD <- ifelse(mec_statQT2$Driving=="Front Wheel Drive", 1,0)
mec_statQT2$DrivingAWD <- ifelse(mec_statQT2$Driving=="All Wheel Drive", 1,0)
mec_statQT2$DrivingRWD <- ifelse(mec_statQT2$Driving=="Rear Wheel Drive", 1,0)
mec_statQT2$TransMan <- ifelse(mec_statQT2$Transmission=="Manual", 1,0)
mec_statQT2$TransCVT <- ifelse(mec_statQT2$Transmission=="CVT", 1,0)
mec_statQT2$TransAuto <- ifelse(mec_statQT2$Transmission=="Automatic", 1,0)
mec_statQT2$FTHybr <- ifelse(mec_statQT2$Fuel_Type=="Hybrid", 1,0)
mec_statQT2$FTPetr <- ifelse(mec_statQT2$Fuel_Type=="Petrol", 1,0)
mec_statQT2$FTDiesel <- ifelse(mec_statQT2$Fuel_Type=="Diesel", 1,0)
mec_statQT2 <- mec_statQT2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))
#OMAN
mec_statOM2$DrivingFWD <- ifelse(mec_statOM2$Driving=="Front Wheel Drive", 1,0)
mec_statOM2$DrivingAWD <- ifelse(mec_statOM2$Driving=="All Wheel Drive", 1,0)
mec_statOM2$DrivingRWD <- ifelse(mec_statOM2$Driving=="Rear Wheel Drive", 1,0)
mec_statOM2$TransMan <- ifelse(mec_statOM2$Transmission=="Manual", 1,0)
mec_statOM2$TransCVT <- ifelse(mec_statOM2$Transmission=="CVT", 1,0)
mec_statOM2$TransAuto <- ifelse(mec_statOM2$Transmission=="Automatic", 1,0)
mec_statOM2$FTHybr <- ifelse(mec_statOM2$Fuel_Type=="Hybrid", 1,0)
mec_statOM2$FTPetr <- ifelse(mec_statOM2$Fuel_Type=="Petrol", 1,0)
mec_statOM2$FTDiesel <- ifelse(mec_statOM2$Fuel_Type=="Diesel", 1,0)
mec_statOM2 <- mec_statOM2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))
#SAUDI ARABIA
mec_statSA2$DrivingFWD <- ifelse(mec_statSA2$Driving=="Front Wheel Drive", 1,0)
mec_statSA2$DrivingAWD <- ifelse(mec_statSA2$Driving=="All Wheel Drive", 1,0)
mec_statSA2$DrivingRWD <- ifelse(mec_statSA2$Driving=="Rear Wheel Drive", 1,0)
mec_statSA2$TransMan <- ifelse(mec_statSA2$Transmission=="Manual", 1,0)
mec_statSA2$TransCVT <- ifelse(mec_statSA2$Transmission=="CVT", 1,0)
mec_statSA2$TransAuto <- ifelse(mec_statSA2$Transmission=="Automatic", 1,0)
mec_statSA2$FTHybr <- ifelse(mec_statSA2$Fuel_Type=="Hybrid", 1,0)
mec_statSA2$FTPetr <- ifelse(mec_statSA2$Fuel_Type=="Petrol", 1,0)
mec_statSA2$FTDiesel <- ifelse(mec_statSA2$Fuel_Type=="Diesel", 1,0)
mec_statSA2 <- mec_statSA2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))
#UNITED ARAB EMIRATES
mec_statUAE2$DrivingFWD <- ifelse(mec_statUAE2$Driving=="Front Wheel Drive", 1,0)
mec_statUAE2$DrivingAWD <- ifelse(mec_statUAE2$Driving=="All Wheel Drive", 1,0)
mec_statUAE2$DrivingRWD <- ifelse(mec_statUAE2$Driving=="Rear Wheel Drive", 1,0)
mec_statUAE2$TransMan <- ifelse(mec_statUAE2$Transmission=="Manual", 1,0)
mec_statUAE2$TransCVT <- ifelse(mec_statUAE2$Transmission=="CVT", 1,0)
mec_statUAE2$TransAuto <- ifelse(mec_statUAE2$Transmission=="Automatic", 1,0)
mec_statUAE2$FTHybr <- ifelse(mec_statUAE2$Fuel_Type=="Hybrid", 1,0)
mec_statUAE2$FTPetr <- ifelse(mec_statUAE2$Fuel_Type=="Petrol", 1,0)
mec_statUAE2$FTDiesel <- ifelse(mec_statUAE2$Fuel_Type=="Diesel", 1,0)
mec_statUAE2 <- mec_statUAE2 %>% select(-c(Driving, Transmission, Fuel_Type, PriceEURO))
Pca 3D plot for fun:
pcatest <- mec_statBR2 %>% prcomp(scale. = TRUE, center = TRUE)
#3D plot just for fun, not rendered in the html document unfortunately
scores=pcatest$x[,1:3] %>% as.data.frame()
plot3d(scores)
text3d(pcatest$rotation[,1:3]*1000,
texts=rownames(pcatest$rotation),
col="red",
cex=0.8)
coords <- c()
for (i in 1:nrow(pcatest$rotation)) {
coords <- rbind(coords, rbind(c(0,0,0),pcatest$rotation[i,1:3]))
}
lines3d(coords*1000,
col="orange",
lwd=1)
PCA function:
Here we apply the Principal component analysis and we select up to the 7th principal component which is the last one with eigenvalues > 1 meaning that each principal component explains at least as much information as one column in the original datasets.
pca<- function(ds, country){
pca1 <- ds %>% prcomp(scale. = TRUE, center = TRUE)
var <- pca1 %>% get_pca_var()
corplot<-corrplot(var$cor[,1:7], is.corr = FALSE, col.lim = c(-1,1),cl.align.text = "l",tl.col = "black" )
contrib<-pca1 %>% fviz_contrib(choice = "var", axes = 1)+ggtitle(paste("Contribution of variables to 1st Principal component - ",country))
pca1 %>% fviz_pca_var(col.var = pca1$rotation[,3], repel = TRUE)+
ggtitle(paste(country, " PCA"))+
scale_color_gradient2(name="Correlation with\nthe third component",limits=c(-1,1), low="red3",mid="deepskyblue",high="navy") #circle correlation plot
return(contrib)
}
PCA results:
pca(mec_statBR2, "Bahrain")
pca(mec_statKW2, "Kuwait")
pca(mec_statOM2, "Oman")
pca(mec_statQT2, "Qatar")
pca(mec_statSA2, "Saudi Arabia")
pca(mec_statUAE2, "United Arab Emirates")
pr_lm <- function(ds_pc,ds_original, request="default"){
if (is.element("PriceEURO", ds_pc %>% colnames())){
PriceEURO<- ds_pc$PriceEURO
ds_pc$PriceEURO<-NULL
}
#PC computation
pc<-ds_pc %>% prcomp(scale. = TRUE, center=TRUE)
sign_var<-pc$sdev %>% data.frame() %>% filter(.>1) %>% dim() %>% .[1]
pc<-pc$x[,1:sign_var] %>% data.frame()
if (class(ds_original)=="data.frame"){
pc$PriceEURO<-ds_original$PriceEURO
} else {
pc$PriceEURO<-PriceEURO
}
if (request=="summary"){
lm_pc=pc %>% lm(log(PriceEURO)~.,.)
lm_pc %>% summary() %>% return()
} else if (request=="tree"){
tree_constructor(pc)
}else if (request=="rf") {
rf_model(pc,"Whatever")
} else {
set.seed(1)
index <- sample(2, nrow(pc), prob = c(0.8, 0.2), replace = TRUE) #so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
Train <- pc[index==1, ] # Train data
Test <- pc[index==2, ]
lm_pc=Train %>% lm(log(PriceEURO)~.,.)
yhat <- predict(lm_pc, newdata=Test)
#in this case you return the R^2
return(1-(
(sum((log(Test$PriceEURO)-yhat)^2))/
(sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))))
}
}
pr_lm(mec_statBR2, mec_statBR,"tree")
##
## The MSE is: 1.880744e-05
##
## R²: 0.798516
## model1_pr.variable.importance
## PC1 384.59112
## PC2 65.39607
## PC7 52.82062
## PC3 49.26478
## PC5 38.05101
## PC4 27.54177
## PC6 25.91887
## [1] 0.798516
pr_lm(mec_statKW2, mec_statKW,"tree")
##
## The MSE is: 0.0003342216
##
## R²: 0.8100026
## model1_pr.variable.importance
## PC1 223.97828
## PC2 61.00428
## PC4 48.99104
## PC3 46.18381
## PC5 45.43138
## PC7 38.46012
## PC6 34.32127
## [1] 0.8100026
pr_lm(mec_statQT2, mec_statQT,"tree")
##
## The MSE is: 9.983318e-05
##
## R²: 0.7812357
## model1_pr.variable.importance
## PC1 230.25238
## PC3 70.13930
## PC2 46.75305
## PC6 43.15769
## PC4 42.97652
## PC5 33.72889
## PC7 15.85542
## [1] 0.7812357
pr_lm(mec_statOM2, mec_statOM,"tree")
##
## The MSE is: 0.0002256637
##
## R²: 0.7608527
## model1_pr.variable.importance
## PC1 230.68425
## PC4 63.04105
## PC7 55.52803
## PC3 44.06926
## PC5 41.90118
## PC2 27.48985
## PC6 21.00088
## [1] 0.7608527
pr_lm(mec_statSA2, mec_statSA,"tree")
##
## The MSE is: 0.002080731
##
## R²: 0.7687297
## model1_pr.variable.importance
## PC1 215.952104
## PC3 42.810689
## PC5 41.759841
## PC2 39.029829
## PC4 38.591129
## PC6 16.792633
## PC7 4.970742
## [1] 0.7687297
pr_lm(mec_statUAE2, mec_statUAE,"tree")
##
## The MSE is: 0.0005434524
##
## R²: 0.7488691
## model1_pr.variable.importance
## PC1 299.15173
## PC2 63.85345
## PC3 55.68239
## PC4 51.89348
## PC5 43.68406
## PC7 39.75202
## PC6 17.61245
## [1] 0.7488691
pr_lm(mec_statBR2, mec_statBR,"summary")
##
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34884 -0.15417 0.00088 0.16265 1.16977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7508273 0.0089760 1197.724 < 2e-16 ***
## PC1 0.2541204 0.0032524 78.134 < 2e-16 ***
## PC2 -0.0817215 0.0045476 -17.970 < 2e-16 ***
## PC3 -0.0009839 0.0060863 -0.162 0.872
## PC4 0.0350355 0.0064834 5.404 8.21e-08 ***
## PC5 0.0806239 0.0078819 10.229 < 2e-16 ***
## PC6 -0.0079893 0.0084767 -0.942 0.346
## PC7 0.0565940 0.0086959 6.508 1.22e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2806 on 969 degrees of freedom
## Multiple R-squared: 0.8721, Adjusted R-squared: 0.8711
## F-statistic: 943.6 on 7 and 969 DF, p-value: < 2.2e-16
pr_lm(mec_statKW2, mec_statKW,"summary")
##
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00627 -0.16322 0.00031 0.18788 0.89924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.733543 0.011865 904.616 < 2e-16 ***
## PC1 0.250790 0.004333 57.885 < 2e-16 ***
## PC2 -0.117592 0.005965 -19.713 < 2e-16 ***
## PC3 -0.015909 0.007977 -1.994 0.04656 *
## PC4 -0.050359 0.008234 -6.116 1.72e-09 ***
## PC5 0.075761 0.010386 7.294 9.47e-13 ***
## PC6 -0.036587 0.011247 -3.253 0.00121 **
## PC7 0.022133 0.011354 1.949 0.05172 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2935 on 604 degrees of freedom
## Multiple R-squared: 0.8643, Adjusted R-squared: 0.8628
## F-statistic: 549.7 on 7 and 604 DF, p-value: < 2.2e-16
pr_lm(mec_statQT2, mec_statQT,"summary")
##
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95183 -0.14513 0.00807 0.16910 0.81265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.838716 0.011233 964.897 < 2e-16 ***
## PC1 0.256159 0.004089 62.649 < 2e-16 ***
## PC2 -0.105311 0.005773 -18.241 < 2e-16 ***
## PC3 -0.003016 0.007439 -0.405 0.6853
## PC4 0.051006 0.007836 6.509 1.61e-10 ***
## PC5 0.079890 0.009573 8.345 4.97e-16 ***
## PC6 0.024388 0.010587 2.304 0.0216 *
## PC7 0.050646 0.010959 4.622 4.67e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2758 on 595 degrees of freedom
## Multiple R-squared: 0.8808, Adjusted R-squared: 0.8794
## F-statistic: 628.1 on 7 and 595 DF, p-value: < 2.2e-16
pr_lm(mec_statOM2, mec_statOM,"summary")
##
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98362 -0.15319 0.00409 0.17384 0.87550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.873718 0.011253 966.280 < 2e-16 ***
## PC1 0.249612 0.004098 60.912 < 2e-16 ***
## PC2 0.104470 0.005802 18.006 < 2e-16 ***
## PC3 -0.060138 0.007678 -7.832 2.23e-14 ***
## PC4 -0.002090 0.008060 -0.259 0.79550
## PC5 0.072024 0.009444 7.627 9.66e-14 ***
## PC6 -0.029007 0.010415 -2.785 0.00552 **
## PC7 0.014708 0.010566 1.392 0.16446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2756 on 592 degrees of freedom
## Multiple R-squared: 0.8755, Adjusted R-squared: 0.874
## F-statistic: 594.8 on 7 and 592 DF, p-value: < 2.2e-16
pr_lm(mec_statSA2, mec_statSA,"summary")
##
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22289 -0.19099 0.00106 0.17930 2.31963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.859775 0.013900 781.296 < 2e-16 ***
## PC1 0.257093 0.005087 50.536 < 2e-16 ***
## PC2 0.098398 0.006839 14.387 < 2e-16 ***
## PC3 -0.053020 0.008951 -5.923 5.53e-09 ***
## PC4 -0.064143 0.010956 -5.855 8.17e-09 ***
## PC5 -0.003164 0.011570 -0.273 0.785
## PC6 -0.068201 0.012374 -5.512 5.44e-08 ***
## PC7 0.057317 0.013740 4.172 3.51e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3304 on 557 degrees of freedom
## Multiple R-squared: 0.8379, Adjusted R-squared: 0.8358
## F-statistic: 411.2 on 7 and 557 DF, p-value: < 2.2e-16
pr_lm(mec_statUAE2, mec_statUAE,"summary")
##
## Call:
## lm(formula = log(PriceEURO) ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02425 -0.17093 0.00908 0.18996 0.90457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.760563 0.010930 984.463 < 2e-16 ***
## PC1 0.261156 0.003970 65.785 < 2e-16 ***
## PC2 0.111463 0.005655 19.711 < 2e-16 ***
## PC3 -0.045508 0.007398 -6.151 1.28e-09 ***
## PC4 0.032803 0.007847 4.180 3.27e-05 ***
## PC5 0.087480 0.009194 9.515 < 2e-16 ***
## PC6 -0.006546 0.009883 -0.662 0.508
## PC7 0.066866 0.010647 6.280 5.87e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2939 on 715 degrees of freedom
## Multiple R-squared: 0.8727, Adjusted R-squared: 0.8715
## F-statistic: 700.3 on 7 and 715 DF, p-value: < 2.2e-16
R2pr_lm<- c(pr_lm(mec_statBR2, mec_statBR),
pr_lm(mec_statKW2, mec_statKW),
pr_lm(mec_statQT2, mec_statQT),
pr_lm(mec_statOM2, mec_statOM),
pr_lm(mec_statSA2, mec_statSA),
pr_lm(mec_statUAE2, mec_statUAE))
R2pr_lm
## [1] 0.8709772 0.8297523 0.8581698 0.8348107 0.8206330 0.8653526
mec_map5 <- data.frame(region=c("Bahrain", "Kuwait", "Qatar", "Oman", "Saudi Arabia", "United Arab Emirates"),
R2pc_lm=R2pr_lm)
mapdata<-map_data("world")
mapdata <- left_join(mapdata, mec_map5, by="region")
mapdata<-mapdata %>% filter(!is.na(mapdata$R2pc_lm))
ggplot(mapdata, aes(x=long, y=lat, group=group))+
geom_polygon(aes(fill=R2pc_lm), color="black")+
scale_fill_gradient(name="R² for the linear model run on\nthe significant PCs\n", low="#CCFFFF", high="#000066",
limit=c(.82, .87),
breaks=seq(from=.82,
to=.87,
by=.01))+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
rect = element_blank())
R2df$R2pca<-R2pr_lm
R2df <- R2df %>% select("region", "R2Tree", "R2rr", "R2pca", "R2rf", "R2best_Tr_rr")
R2df
## region R2Tree R2rr R2pca R2rf
## 1 Bahrain 0.8180443 0.8886621 0.8709772 0.9458579
## 2 Kuwait 0.7998163 0.8531936 0.8297523 0.9260215
## 3 Oman 0.8358280 0.8713075 0.8581698 0.9389488
## 4 Qatar 0.8542003 0.8709802 0.8348107 0.9434955
## 5 Saudi Arabia 0.7114432 0.8458653 0.8206330 0.9077597
## 6 United Arab Emirates 0.8046047 0.8914368 0.8653526 0.9359294
## R2best_Tr_rr
## 1 Robust Regression
## 2 Robust Regression
## 3 Robust Regression
## 4 Robust Regression
## 5 Robust Regression
## 6 Robust Regression
pr_lm(mec_statBR2, mec_statBR,"rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.04532261
## % Var explained: 92.58
## Mean squared error: 8.394225e-05
## R²: 0.9216441
pr_lm(mec_statKW2, mec_statKW,"rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 400
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 400
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.05230703
## % Var explained: 91.4
## Mean squared error: 0.0002257347
## R²: 0.9108987
pr_lm(mec_statQT2, mec_statQT,"rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.05767763
## % Var explained: 90.69
## Mean squared error: 4.920689e-05
## R²: 0.9224049
pr_lm(mec_statOM2, mec_statOM,"rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 500
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.05116926
## % Var explained: 91.57
## Mean squared error: 0.000670271
## R²: 0.9179071
pr_lm(mec_statSA2, mec_statSA,"rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 450
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 450
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.0658544
## % Var explained: 89.83
## Mean squared error: 6.453369e-05
## R²: 0.8802514
pr_lm(mec_statUAE2, mec_statUAE,"rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 400
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 400
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.05518666
## % Var explained: 91.86
## Mean squared error: 5.156505e-05
## R²: 0.9061594
mec_stat %>% select(-Horsepower, -Engine_Capacity) %>% tree_constructor()
##
## The MSE is: 0.0002739199
##
## R²: 0.8426183
## model1_pr.variable.importance
## Torque 1637.17940
## Acceleration_0100 1105.03200
## Top_Speed 1012.32237
## Cylinders 855.20211
## Wheelbase 650.83485
## Fuel_Capacity 641.48866
## Width 123.21335
## Length 35.35770
## Height 25.93947
## Trunk_Capacity 22.12323
## Liters_For_100km 22.09052
## [1] 0.8426183
I created a new function so that I re inserted the variable Area in the dataset. Not the smartest thing to do but it works.
rob_reg1<-function(ds){
set.seed(1)
index <- sample(2, nrow(ds), prob = c(0.8, 0.2), replace = TRUE)
#so we give to each observation a prob of 80% of falling into the train and 20% of falling into the test
PriceEURO<-ds$PriceEURO
Area_<-ds$Area
# difference with the rob_reg function !!
ds<-ds %>% select(where(is.numeric), -PriceEURO) %>% scale() %>% data.frame() %>% cbind(PriceEURO)
ds<- ds %>% cbind(Area_)
Train <- ds[index==1, ] # Train data
Test <- ds[index == 2, ] #Test data
model1<- Train %>% lmrob(log(PriceEURO)~.,.)
predicted<-predict(model1, newdata=Test)
cat("\nR²: ",
1-(
(sum((log(Test$PriceEURO)-predicted)^2))/
(sum((log(Test$PriceEURO)-mean(log(Test$PriceEURO)))^2))),"\n")
model1 %>% summary() %>% print()
model1$coefficients %>% sort(decreasing = TRUE)
}
mec_stat %>% select(-Horsepower, -Engine_Capacity) %>% rob_reg1()
##
## R²: 0.8744916
##
## Call:
## lmrob(formula = log(PriceEURO) ~ ., data = .)
## \--> method = "MM"
## Residuals:
## Min 1Q Median 3Q Max
## -1.2990007 -0.1589043 -0.0004993 0.1564809 2.4769161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.770095 0.008705 1237.285 < 2e-16 ***
## Cylinders 0.088369 0.012959 6.819 1.09e-11 ***
## Fuel_Capacity 0.114334 0.010999 10.395 < 2e-16 ***
## Liters_For_100km -0.033188 0.012012 -2.763 0.00576 **
## Torque 0.246575 0.015042 16.392 < 2e-16 ***
## Top_Speed 0.242983 0.011813 20.568 < 2e-16 ***
## Seating_Capacity -0.104550 0.009298 -11.244 < 2e-16 ***
## Acceleration_0100 -0.133719 0.010904 -12.264 < 2e-16 ***
## Length -0.019901 0.014475 -1.375 0.16929
## Width 0.042182 0.005506 7.661 2.42e-14 ***
## Height 0.085787 0.009158 9.367 < 2e-16 ***
## Wheelbase 0.108329 0.013731 7.890 4.12e-15 ***
## Trunk_Capacity -0.040947 0.006344 -6.454 1.25e-10 ***
## Area_Kuwait -0.014420 0.014269 -1.011 0.31228
## Area_Oman 0.026261 0.013936 1.884 0.05961 .
## Area_Qatar 0.019414 0.013896 1.397 0.16248
## Area_Saudi Arabia 0.126977 0.015734 8.070 9.80e-16 ***
## Area_UAE 0.062228 0.014477 4.298 1.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Robust residual standard error: 0.2302
## Multiple R-squared: 0.909, Adjusted R-squared: 0.9085
## Convergence in 14 IRWLS iterations
##
## Robustness weights:
## 4 observations c(21,936,1355,1495) are outliers with |weight| = 0 ( < 3.1e-05);
## 257 weights are ~= 1. The remaining 2973 ones are summarized as
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0181 0.8621 0.9507 0.8871 0.9853 0.9990
## Algorithmic parameters:
## tuning.chi bb tuning.psi refine.tol
## 1.548e+00 5.000e-01 4.685e+00 1.000e-07
## rel.tol scale.tol solve.tol zero.tol
## 1.000e-07 1.000e-10 1.000e-07 1.000e-10
## eps.outlier eps.x warn.limit.reject warn.limit.meanrw
## 3.092e-05 8.117e-12 5.000e-01 5.000e-01
## nResample max.it groups n.group best.r.s
## 500 50 5 400 2
## k.fast.s k.max maxit.scale trace.lev mts
## 1 200 200 0 1000
## compute.rd fast.s.large.n
## 0 2000
## psi subsampling cov
## "bisquare" "nonsingular" ".vcov.avar1"
## compute.outlier.stats
## "SM"
## seed : int(0)
## (Intercept) Torque Top_Speed Area_Saudi Arabia
## 10.77009494 0.24657548 0.24298325 0.12697726
## Fuel_Capacity Wheelbase Cylinders Height
## 0.11433437 0.10832881 0.08836917 0.08578749
## Area_UAE Width Area_Oman Area_Qatar
## 0.06222784 0.04218167 0.02626120 0.01941357
## Area_Kuwait Length Liters_For_100km Trunk_Capacity
## -0.01442017 -0.01990056 -0.03318842 -0.04094695
## Seating_Capacity Acceleration_0100
## -0.10454984 -0.13371887
mec_stat %>% select(-Horsepower, -Engine_Capacity) %>% rf_model(., "Big dataset")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 50
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 50
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.01995721
## % Var explained: 96.85
## Mean squared error: 8.83196e-05
## R²: 0.9696733
pcBR<- cbind(mec_statBR2, mec_statBR$PriceEURO)
pcBR$Area<-0
pcBR<-pcBR %>% rename("PriceEURO"="mec_statBR$PriceEURO")
pcKW<-cbind(mec_statKW2, mec_statKW$PriceEURO)
pcKW$Area<-1
pcKW<-pcKW %>% rename("PriceEURO"="mec_statKW$PriceEURO")
pcQT<-cbind(mec_statQT2, mec_statQT$PriceEURO)
pcQT$Area<-2
pcQT<-pcQT %>% rename("PriceEURO"="mec_statQT$PriceEURO")
pcOM<-cbind(mec_statOM2, mec_statOM$PriceEURO)
pcOM$Area<-3
pcOM<-pcOM %>% rename("PriceEURO"="mec_statOM$PriceEURO")
pcSA<-cbind(mec_statSA2, mec_statSA$PriceEURO)
pcSA$Area<-4
pcSA<-pcSA %>% rename("PriceEURO"="mec_statSA$PriceEURO")
pcUAE<-cbind(mec_statUAE2, mec_statUAE$PriceEURO)
pcUAE$Area<-5
pcUAE<-pcUAE %>% rename("PriceEURO"="mec_statUAE$PriceEURO")
mec_statPC <- rbind(pcBR, pcKW, pcQT, pcOM,pcSA,pcUAE)
mec_statPC %>% pr_lm(.,0,request="default")
## [1] 0.8605758
mec_statPC %>% pr_lm(.,0,request="tree")
##
## The MSE is: 2.870222e-05
##
## R²: 0.8008268
## model1_pr.variable.importance
## PC1 1532.48062
## PC3 298.94810
## PC4 279.02838
## PC2 201.41527
## PC5 185.15232
## PC6 129.45372
## PC7 90.63286
## [1] 0.8008268
mec_statPC %>% pr_lm(.,0,request="rf")
##
## The best nr of mtry for min error is: 4
## The best nr of trees for min error is: 400
## Call:
## randomForest(formula = log(PriceEURO) ~ ., data = Train, ntree = best_ntree, mtry = best_mtry_model$bestTune[1, 1], importance = TRUE)
## Type of random forest: regression
## Number of trees: 400
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.02866165
## % Var explained: 95.48
## Mean squared error: 3.447948e-06
## R²: 0.959543
pca(mec_statPC %>% select(-Horsepower, - Engine_Capacity, -PriceEURO), "General model")
mec_statPC %>% select(-Horsepower, - Engine_Capacity, -PriceEURO) %>% prcomp(scale. = TRUE, center = TRUE) %>% fviz_eig(choice = "variance",addlabels=TRUE,xlab = "Principal components",
main = "" ,ylab = "",ncp=7,barfill = "deepskyblue", linecolor = "blue") + theme_minimal()